Preparation
##### Load required R libraries
library(Seurat)
require(scales)
require(ggplot2)
require(viridis)
require(dplyr)
require(GeneTrajectory)
require(Matrix)
require(plot3D)
library(openxlsx)
Loading preprocessed data
setwd("/data/sci-fate_analysis/data/")
data_S <- readRDS("/data/sci-fate_analysis/data/data_S_scifate_CC_GR.rds")
DimPlot(data_S, group.by = "treatment_time", reduction = "umap.orig")

DimPlot(data_S, group.by = "treatment_time", reduction = "umap.new.GR")

TF_gene_summary <- read.xlsx("../41587_2020_480_MOESM3_ESM.xlsx", sheet = 2)
colnames(TF_gene_summary) <- TF_gene_summary[1,]
TF_gene_summary <- TF_gene_summary[-1,]
head(TF_gene_summary)
## TF linked_gene linked_gene_id TF_link Regression coefficient
## 2 AR LIN7A ENSG00000111052.3 AR_LIN7A 5.9163964032391797E-2
## 3 AR GPC6 ENSG00000183098.6 AR_GPC6 4.7126417860035602E-2
## 4 AR BAMBI ENSG00000095739.7 AR_BAMBI 4.7107661847319897E-2
## 5 AR SYT1 ENSG00000067715.9 AR_SYT1 4.2513331324419798E-2
## 6 AR ARHGAP24 ENSG00000138639.13 AR_ARHGAP24 4.20068808011929E-2
## 7 AR RBMS3 ENSG00000144642.16 AR_RBMS3 4.1794736793863203E-2
## group
## 2 Activating links by newly synthesized mRNA
## 3 Activating links by newly synthesized mRNA
## 4 Activating links by newly synthesized mRNA
## 5 Activating links by newly synthesized mRNA
## 6 Activating links by newly synthesized mRNA
## 7 Activating links by newly synthesized mRNA
length(unique(TF_gene_summary$TF))
## [1] 60
CC_TFs <- c("E2F1","E2F2","E2F7","EZH2","BRCA1","E2F8","FOXM1","MYBL2")
GR_TFs <- c("BCL6", "FOXO1", "CEBPB", "GTF2IRD1", "JUNB", "RARB", "THRA")
CC_genes <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF %in% CC_TFs)])
GR_genes <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF %in% GR_TFs)])
all_genes <- unique(c(CC_genes, GR_genes))
length(all_genes)
## [1] 674
Gene-gene distance computation
Prepare the input for gene-gene Wasserstein distance
computation
data_S <- GeneTrajectory::RunDM(data_S, reduction = "pca.joint", dims = 1:20)
cell.graph.dist <- GetGraphDistance(data_S, K = 5)
## Constructing kNN graph
## Constructing graph distance matrix
## The largest graph distance is 46
genes <- all_genes
cg_output <- CoarseGrain(data_S, cell.graph.dist, genes, N = 1000, assay = "RNA_new")
## Run k-means clustering
## Coarse-grain matrices
#remove the gene with zero count
which(apply(cg_output[["gene.expression"]], 2, sum) == 0)
## ENSG00000268942.1
## 429
#####Save the files for computation in Python
dir.path <- "/banach1/rq25/tmp/sci-fate_example/"
dir.create(dir.path)
write.table(cg_output[["features"]][-which(apply(cg_output[["gene.expression"]], 2, sum) == 0)], paste0(dir.path, "gene_names.csv"), row.names = F, col.names = F, sep = ",")
write.table(cg_output[["graph.dist"]], paste0(dir.path, "ot_cost.csv"), row.names = F, col.names = F, sep = ",")
Matrix::writeMM(Matrix(cg_output[["gene.expression"]][,-which(apply(cg_output[["gene.expression"]], 2, sum) == 0)], sparse = T), paste0(dir.path, "gene_expression.mtx"))
## NULL
Wasserstein distance computation in Python
# Make sure to install the latest version of POT module (python), using the following:
# pip install -U https://github.com/PythonOT/POT/archive/master.zip
# Run the following command to get the gene-gene Wasserstein distance matrix
#system(sprintf("nohup /data/rihao/anaconda3/bin/python /data/rihao/GeneTrajectory/GeneTrajectory/python/gene_distance_cal_parallel.py %s &", dir.path))
Gene trajectory inference and visualization
setwd("/data/sci-fate_analysis/data/")
dir.path <- "./GT_CC_GR_genes_DM5_K5_N1000_RNAnew/"
gene.dist.mat.all <- LoadGeneDistMat(dir.path, file_name = "emd.csv")
gene.dist.mat <- gene.dist.mat.all
gene_embedding <- GetGeneEmbedding(gene.dist.mat, K = 5)$diffu.emb
colvar <- rep(0, nrow(gene_embedding))
colvar[which(rownames(gene_embedding) %in% CC_genes)] <- 1
scatter3D(gene_embedding[,1],
gene_embedding[,2],
gene_embedding[,3],
bty = "b2", colvar = colvar,
main = "trajectory", pch = 19, cex = 1, theta = 45, phi = 0,
col = ramp.col(c(hue_pal()(3))))

Visualize gene bin plots
##### Focus on genes expressed in 10%-75% cells
assay <- "RNA_new"
DefaultAssay(data_S) <- assay
all_genes <- unique(c(CC_genes, GR_genes))
expr_percent <- apply(as.matrix(data_S[[assay]]$data[all_genes, ]) > 0, 1, sum)/ncol(data_S)
genes <- all_genes[which(expr_percent > 0.1 & expr_percent < 0.75)]
length(genes)
## [1] 618
gene.dist.mat.all <- gene.dist.mat.all[genes, genes]
##### Define CC gene ordering
gene.dist.mat <- gene.dist.mat.all[which(rownames(gene.dist.mat.all) %in% c(CC_genes)),
which(rownames(gene.dist.mat.all) %in% c(CC_genes))]
gene_embedding <- GetGeneEmbedding(gene.dist.mat, K = 5)$diffu.emb
gene_trajectory <- ExtractGeneTrajectory(gene_embedding, gene.dist.mat, N = 1, t.list = c(1000), K = 5)
## ENSG00000187837.2
gene_trajectory_CC_RNA_new <- gene_trajectory
##### Define GR gene ordering
gene.dist.mat <- gene.dist.mat.all[which(rownames(gene.dist.mat.all) %in% c(GR_genes)),
which(rownames(gene.dist.mat.all) %in% c(GR_genes))]
gene_embedding <- GetGeneEmbedding(gene.dist.mat, K = 5)$diffu.emb
gene_trajectory <- ExtractGeneTrajectory(gene_embedding, gene.dist.mat, N = 1, t.list = c(1000), K = 5)
## ENSG00000163535.13
gene_trajectory_GR_RNA_new <- gene_trajectory
Nbin <- 7
##### Visualize CC gene plots
gene_trajectory_CC_RNA_new -> gene_trajectory
data_S <- AddGeneBinScore(data_S, gene_trajectory, N.bin = Nbin, trajectories = 1:1, assay = "RNA_new")
FeaturePlot(data_S, pt.size = 0.05, features = paste0("Trajectory",1,"_genes", 1:Nbin), ncol = Nbin, order = F, reduction = "umap.orig") &
scale_color_gradientn(colors = rev(brewer_pal(palette = "RdYlBu")(10))) & NoLegend() & NoAxes()
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

##### Visualize GR gene plots
gene_trajectory_GR_RNA_new -> gene_trajectory
data_S <- AddGeneBinScore(data_S, gene_trajectory, N.bin = Nbin, trajectories = 1:1, reverse = T, assay = "RNA_new")
FeaturePlot(data_S, pt.size = 0.05, features = paste0("Trajectory",1,"_genes", 1:Nbin), ncol = Nbin, order = F, reduction = "umap.new.GR") &
scale_color_gradientn(colors = rev(brewer_pal(palette = "RdYlBu")(10))) & NoLegend() & NoAxes()
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

Visualize gene distribution plots for CC process
##### CC gene distribution plots
gene_trajectory_CC_RNA_new -> gene_trajectory
gene_trajectory <- gene_trajectory[order(gene_trajectory$Pseudoorder1),]
CC_TFs <- c("E2F1","E2F2","E2F7","EZH2","BRCA1","E2F8","FOXM1","MYBL2")
CC_TF_gene_lists <- list()
for (i in 1:length(CC_TFs)){
CC_TF_gene_lists[[CC_TFs[i]]] <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == CC_TFs[i])])
}
freq_table <- table(do.call(c,CC_TF_gene_lists))
freq = as.integer(freq_table)
names(freq) = names(freq_table)
ggplot_list <- list()
for (i in 1:length(CC_TFs)){
gene_set <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == CC_TFs[i])])
CC_genes_df <- data.frame(genes = rownames(gene_trajectory),
group = NA)
CC_genes_df$freq <- freq[rownames(gene_trajectory)]
CC_genes_df$group[which(CC_genes_df$genes %in% gene_set)] = CC_TFs[i]
CC_genes_df$index <- 1:nrow(CC_genes_df)
ggplot_list[[i]] <-
ggplot(CC_genes_df %>% filter(!is.na(group) & freq <= 100), aes(x = index, fill = group)) +
geom_density(linetype = 1, alpha = 0.75) + xlim(c(1, nrow(gene_trajectory))) +
scale_fill_manual(values = c("darkblue", "orange", "forestgreen"))+ #NoLegend()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
ggtitle(CC_TFs[i])
}
print(ggpubr::ggarrange(plotlist = ggplot_list,
ncol = 4, nrow = 2, align = "v"))

Visualize gene distribution plots for CC process: E2F1, BRCA1,
FOXM1
CC_TFs2 <- c("E2F1","BRCA1","FOXM1")
CC_TF_gene_lists <- list()
for (i in 1:length(CC_TFs2)){
CC_TF_gene_lists[[CC_TFs2[i]]] <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == CC_TFs2[i])])
}
freq_table <- table(do.call(c,CC_TF_gene_lists))
freq = as.integer(freq_table)
names(freq) = names(freq_table)
CC_genes_df <- data.frame(genes = rownames(gene_trajectory),
group = NA)
CC_genes_df$freq <- freq[rownames(gene_trajectory)]
gene_set <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == CC_TFs2[1])])
CC_genes_df$group[which(CC_genes_df$genes %in% gene_set)] = CC_TFs2[1]
gene_set <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == CC_TFs2[2])])
CC_genes_df$group[which(CC_genes_df$genes %in% gene_set)] = CC_TFs2[2]
gene_set <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == CC_TFs2[3])])
CC_genes_df$group[which(CC_genes_df$genes %in% gene_set)] = CC_TFs2[3]
CC_genes_df$index <- 1:nrow(CC_genes_df)
p <-
ggplot(CC_genes_df %>% filter(!is.na(group) & freq ==1), aes(x = index, fill = group)) +
geom_density(linetype = 1, alpha = 0.75) + xlim(c(1, nrow(gene_trajectory))) +
scale_fill_manual(values = c("purple", "orange", "forestgreen"))+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
p

Visualize gene distribution plots for GR process
##### GR gene distribution plots
gene_trajectory_GR_RNA_new -> gene_trajectory
gene_trajectory <- gene_trajectory[order(gene_trajectory$Pseudoorder1),]
GR_TFs <- c("BCL6", "FOXO1", "CEBPB", "GTF2IRD1", "JUNB", "RARB", "THRA")
GR_TF_gene_lists <- list()
for (i in 1:length(GR_TFs)){
GR_TF_gene_lists[[GR_TFs[i]]] <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == GR_TFs[i])])
}
freq_table <- table(do.call(c,GR_TF_gene_lists))
freq = as.integer(freq_table)
names(freq) = names(freq_table)
ggplot_list <- list()
for (i in 1:length(GR_TFs)){
gene_set <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == GR_TFs[i])])
GR_genes_df <- data.frame(genes = rownames(gene_trajectory),
group = NA)
GR_genes_df$freq <- freq[rownames(gene_trajectory)]
GR_genes_df$group[which(GR_genes_df$genes %in% gene_set)] = GR_TFs[i]
GR_genes_df$index <- 1:nrow(GR_genes_df)
ggplot_list[[i]] <-
ggplot(GR_genes_df %>% filter(!is.na(group) & freq <= 100), aes(x = index, fill = group)) +
geom_density(linetype = 1, alpha = 0.75) + xlim(c(1, nrow(gene_trajectory))) +
scale_fill_manual(values = c("darkblue", "orange", "forestgreen"))+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
ggtitle(GR_TFs[i])
}
print(ggpubr::ggarrange(plotlist = ggplot_list,
ncol = 4, nrow = 2, align = "v"))
